Env setup¶

In [1]:
import pandas as pd
import socket
import os
import yaml
import sys
import statsmodels.api as sm
from scipy import stats
import pyranges as pr
from pyliftover import LiftOver
import seaborn as sns
import scanpy as sc
from matplotlib import rcParams
import matplotlib.pylab as plt
import numpy as np
import anndata as ad
rcParams['figure.figsize'] = 11.7,8.27

Configure paths¶

In [2]:
# --- Output / resource paths -------------------------------------------------
outdir = "../data/output"

# Map of iPSC line metadata (new display names and plotting colors)
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
colorsmap = dict(zip([i["newName"] for i in iPSC_lines_map.values()],
                     [i["color"] for i in iPSC_lines_map.values()]))


figDir = "./figures"
if not os.path.exists(figDir):
   # Create a new directory because it does not exist
   os.makedirs(figDir)

vcfPath = "../data/jointGenotype.g.vcf"

#Path to cellranger gex-GRCh38-2020-A gtf
gtfPath = "../data/resources/genes.gtf"

#Path to kolf from HIPSCI
kolfGTRaw = "../data/resources/HPSI0114i-kolf_2.wgs.gatk.haplotype_caller.20170425.genotypes.vcf.gz"

# Path where BULKrna / WGS matched loci will be written
kolfWGS_Matched_loci  = outdir+"/WGS_Matched_loci.kolf.tsv"

# Host-dependent paths are resolved from a per-user YAML config
hostRoot = "-".join(socket.gethostname().split('-')[0:2])

with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
    paths = yaml.load(f, Loader=yaml.FullLoader)

# BUGFIX: the statements below were fused onto single lines by the notebook
# export (invalid Python); restored to one statement per line.
indir = paths["paths"]["indir"][hostRoot]
outdir = paths["paths"]["outdir"][hostRoot]
projectBaseDir = paths["paths"]["projectBaseDir"][hostRoot]
markers = paths["paths"]["markers"][hostRoot]
score_genes_mod_path = paths["paths"]["score_genes_mod"][hostRoot]

with open(projectBaseDir+"/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]

with open(markers, 'r') as f:
    markers = yaml.load(f, Loader=yaml.FullLoader)["markers"]["CBOs"]

sys.path.append(score_genes_mod_path)
from score_genes_mod import *  # noqa: F403 -- project scoring helpers

# Absolute, host-specific overrides (re-assign the relative defaults above)
vcfPath = "../data/jointGenotype.g.vcf"
kolfGT = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/KOLF.WGS.unflitered.hg38.bed"
kolfGTRaw = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/external_genotypes/HPSI0114i-kolf_2.wgs.gatk.haplotype_caller.20170425.genotypes.vcf.gz"
kolfWGS_Matched_loci = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/external_genotypes/WGS_Matched_loci.kolf.tsv"
acghPath = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing_io/data/external_genotypes/GSA2019_414_025_v2_FinalReport.headless.txt"
outDF = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing/downstreamAnalysis/ASE_FOCUS/WGSmappings.tsv"
outCumulativeASE = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing/downstreamAnalysis/ASE_FOCUS/CumulativeASE.tsv"
outMappedVariants = "/group/testa/Users/davide.castaldi/organoidMultiplexing/organoidMultiplexing/downstreamAnalysis/ASE_FOCUS/MappedVariants.tsv"

LiftOver function¶

In [3]:
def liftOverDF(inDF, chrKey, posKey, indexerKey, startGenome, targetGenome):
    """Lift genomic coordinates in a dataframe between genome builds.

    Parameters
    ----------
    inDF : pd.DataFrame
        One row per locus.
    chrKey, posKey : str
        Column names holding chromosome and position.
    indexerKey : str
        Column holding the "CHROM_POS"-style locus identifier.
    startGenome, targetGenome : str
        Genome build names understood by pyliftover (e.g. "hg38", "hg19").

    Returns
    -------
    pd.DataFrame restricted to uniquely-lifted loci, with old/new coordinate
    columns suffixed by the respective genome build names.
    """
    lo = LiftOver(startGenome, targetGenome)

    Liftedvariants = []

    __inDF = inDF.copy()

    # pyliftover expects UCSC-style "chr"-prefixed chromosome names; remember
    # whether the input already had the prefix so the output style matches.
    if not __inDF[chrKey].head(1).tolist()[0].startswith("chr"):
        chrRoot = False
        __inDF[chrKey] = "chr"+__inDF[chrKey].astype(str)
    else:
        chrRoot = True

    for row in __inDF.index.tolist():

        pos = __inDF.loc[row]

        posLiftedStart = lo.convert_coordinate(str(pos[chrKey]), pos[posKey])

        # BUGFIX: test for None BEFORE len(); the original evaluated
        # `(len(x) == 1) & (x is not None)`, which raises TypeError when
        # convert_coordinate returns None. Only uniquely-lifted loci are kept.
        if (posLiftedStart is not None) and (len(posLiftedStart) == 1):

            Liftedvariants.append([posLiftedStart[0][0], posLiftedStart[0][1],  row])

    LiftedEnhancers = pd.DataFrame(Liftedvariants, columns=["lifted_"+chrKey,"lifted_"+posKey,"indexRow"])
    LiftedEnhancers.index = LiftedEnhancers["indexRow"]

    # Inner merge drops loci that failed to lift
    outDF = pd.merge(__inDF, LiftedEnhancers, how="inner", left_index=True, right_index=True)
    outDF["old_"+chrKey+"_"+str(startGenome)] = outDF[chrKey].astype(str).str.split("chr", expand = True)[1] if not chrRoot else outDF[chrKey]
    outDF["old_"+posKey+"_"+str(startGenome)] = outDF[posKey]

    outDF[chrKey+"_"+str(targetGenome)] = outDF["lifted_"+chrKey].astype(str).str.split("chr", expand = True)[1] if not chrRoot else outDF["lifted_"+chrKey].astype(str)
    outDF[posKey+"_"+str(targetGenome)] = outDF["lifted_"+posKey]
    # BUGFIX: the original assigned the DataFrame produced by
    # .str.split("chr", expand=True) to a single column; for the "CHROM_POS"
    # identifiers used here (no "chr" substring) that was simply a copy of
    # the indexer column, which is what we do explicitly now.
    outDF["old_"+indexerKey+"_"+str(startGenome)] = outDF[indexerKey]

    outDF[indexerKey+"_"+str(targetGenome)] = outDF[chrKey+"_"+str(targetGenome)]+"_"+outDF["lifted_"+posKey].astype(str)

    outDF.drop(columns=["lifted_"+chrKey,"lifted_"+posKey,chrKey,posKey,indexerKey], inplace=True)

    return(outDF)

Import VCF and Map snps to genes¶

In [4]:
def get_vcf_names(vcf_path):
    """Return the column names from a VCF header.

    Scans `vcf_path` line by line for the ``#CHROM`` header row and returns
    its tab-separated fields (the leading ``#`` and the trailing newline on
    the last field are kept; the caller strips them).

    Raises:
        ValueError: if the file contains no ``#CHROM`` header line
            (the original code left `vcf_names` unbound -> NameError).
    """
    # `with` closes the file on every exit path; the original's extra
    # ifile.close() after the with-block was redundant and is removed.
    with open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                return line.split('\t')
    raise ValueError("No #CHROM header line found in {}".format(vcf_path))

# Clean header names ("#CHROM" -> "CHROM", strip newline) and build the
# raw-line-ID -> display-name mapping from the iPSC metadata
names = [i.replace("#","").replace('\n','') for i in get_vcf_names(vcfPath)] 
mappings = {k: v for d in [{k:iPSC_lines_map[k]["newName"]} for k in list(iPSC_lines_map.keys())] for k, v in d.items()}


# Joint-genotype VCF as a plain dataframe, sample columns renamed to display names
vcf = pd.read_csv(vcfPath, sep="\t", skip_blank_lines=True,comment='#', names=names).rename(columns = mappings)

# Build a PyRanges of the variant positions (Start == End == POS) for overlap
MappedVars = vcf.copy()
MappedVars["Chromosome"] = "chr"+vcf["CHROM"].astype(str)
MappedVars["End"] = MappedVars["POS"]
MappedVars["Start"] = MappedVars["POS"]
MappedVars = MappedVars[["Chromosome","Start","End","CHROM"]]
MappedVars = pr.PyRanges(MappedVars)
MappedVars

#Prepare gtf for overlap, we exclude non genes features and keep only coding genes
gtf = pd.read_csv(gtfPath, sep="\t", comment='#', header=None)
gtf = gtf[gtf[2] == "gene"]
# Column 8 is the GTF attribute string; pull gene_name / gene_type out of it
gtf['GENE'] = gtf[8].str.extract(r'gene_name "(.+?)";')
gtf['GENEtype'] = gtf[8].str.extract(r'gene_type "(.+?)";')
gtf = gtf[gtf['GENEtype'] == "protein_coding"]
gtf = gtf[[0,3,4,"GENE"]]
gtf.columns = ["Chromosome","Start","End","SYMBOL"]
gtf = pr.PyRanges(gtf)
gtf

# Overlap-join variants with genes; keep one gene per position, then index
# both tables by the shared "CHROM_POS" key used throughout the notebook
MappedVars = MappedVars.join(gtf).df.drop_duplicates(subset=["Chromosome","Start","End"]).drop(columns=["Start_b","End_b"])
MappedVars.index = MappedVars["CHROM"].astype(str)+"_"+MappedVars["Start"].astype(str)
MappedVars["indexer"] = MappedVars.index.tolist()

vcf.index  = vcf["CHROM"].astype(str)+"_"+vcf["POS"].astype(str)
/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (0) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
In [5]:
# Preview: variants annotated with their overlapping protein-coding gene symbol
MappedVars
Out[5]:
Chromosome Start End CHROM SYMBOL indexer
1_944296 chr1 944296 944296 1 SAMD11 1_944296
1_944307 chr1 944307 944307 1 SAMD11 1_944307
1_944531 chr1 944531 944531 1 SAMD11 1_944531
1_946247 chr1 946247 946247 1 NOC2L 1_946247
1_946538 chr1 946538 946538 1 NOC2L 1_946538
... ... ... ... ... ... ...
Y_12709406 chrY 12709406 12709406 Y USP9Y Y_12709406
Y_12914649 chrY 12914649 12914649 Y DDX3Y Y_12914649
Y_12914655 chrY 12914655 12914655 Y DDX3Y Y_12914655
Y_13251168 chrY 13251168 13251168 Y UTY Y_13251168
Y_13395391 chrY 13395391 13395391 Y UTY Y_13395391

83720 rows × 6 columns

Ase main function¶

In [6]:
def aseByObs(adataVar, genoOBS, geno, threshold, fdr,vcf, celltypeOBS, mappedVariants,groupby=None,groups=None ):
    """Cluster-wise allele-specific expression (ASE) via per-locus binomial test.

    The whole function pivots around celltype division, which is in our opinion
    the main covariate that cannot be neglected during ASE exploration.

    Parameters
    ----------
    adataVar : AnnData
        Per-variant AnnData with "AltReads"/"RefReads" layers (cells x loci).
    genoOBS : str
        obs column containing the second covariate to isolate (genotype ID).
    geno : str or None
        Genotype to restrict the analysis to (run multiple times with different
        geno values for several genotypes); must be a column of the VCF table.
        * If your sample contains only 1 genotype you should still use
          genoOBS/geno (provide a mock column with the same value).
        * To aggregate across genotypes (ignoring genoOBS isolation at your
          own risk) set genoOBS=yourGenoObs and geno=None.
    threshold : int
        Minimum total reads for a locus to be tested within a cluster.
    fdr : float
        Alpha for the Benjamini-Hochberg correction.
    vcf : pd.DataFrame
        Joint-genotype table indexed by "CHROM_POS".
    celltypeOBS : str
        obs column holding the cell-type / cluster assignment.
    mappedVariants : pd.DataFrame
        Secondary DF with complementary info per locus (rs, gene mapping, etc.).
    groupby, groups : optional
        obs column / values to further subset cells before testing.

    Returns
    -------
    pd.DataFrame with per-locus, per-celltype binomial p-value, BH-FDR,
    significance flag, Alt/Ref read counts and alt-allele proportion.
    """
    # BUGFIX: the body originally referenced the module-level `varAdata`
    # instead of the `adataVar` parameter; bind it here so the argument is
    # honored (backward compatible: existing calls pass the same object).
    varAdata = adataVar

    #First we keep only SNPs mapped to genes    
    vcf = vcf.loc[mappedVariants.index.tolist()]

    if not isinstance(groups, list):
        groups = [groups]

    if geno is not None:
        # Loci called heterozygous (GT field "0/1") for this genotype
        hetGeno=vcf.loc[vcf[geno].str.split(":", expand = True)[0] == '0/1',["CHROM","POS"]].astype(str).apply('_'.join, axis=1).tolist()
        print(len(hetGeno))

        # Select for ID, obs and Het loci from bulk
        if groupby is not None:
            varAdata_DS_geno = varAdata[
            ((varAdata.obs[genoOBS] == geno) & (varAdata.obs[groupby].isin(groups))),
            list(set(hetGeno).intersection(set(varAdata.var_names.tolist())))
            ]
        else:
            varAdata_DS_geno = varAdata[varAdata.obs[genoOBS] == geno,
            list(set(hetGeno).intersection(set(varAdata.var_names.tolist())))
            ]
    else:
        # Aggregate information from all genotypes after checking they are
        # all het at the given locus
        GenoAdataDict = {}
        for g in varAdata.obs[genoOBS].unique():

            hetGeno = vcf.loc[vcf[g].str.contains("0/1"),g].index.tolist()
            hetGeno = list(set(hetGeno).intersection(set(varAdata.var_names.tolist())))

            GenoAdataDict[g] = varAdata[varAdata.obs[genoOBS] == g,hetGeno]

        varAdata_DS_geno = ad.concat(list(GenoAdataDict.values()), join="outer")
        del GenoAdataDict

    # Isolate ref and alt reads per cluster: loci x clusters count matrices
    AltReadsDF = pd.DataFrame([pd.Series((varAdata_DS_geno[varAdata_DS_geno.obs[celltypeOBS] == cl].layers["AltReads"].sum(axis = 0).flatten().A1),index=varAdata_DS_geno.var_names) for cl in varAdata_DS_geno.obs[celltypeOBS].unique()], index=varAdata_DS_geno.obs[celltypeOBS].unique()).T
    RefReadsDF = pd.DataFrame([pd.Series((varAdata_DS_geno[varAdata_DS_geno.obs[celltypeOBS] == cl].layers["RefReads"].sum(axis = 0).flatten().A1),index=varAdata_DS_geno.var_names) for cl in varAdata_DS_geno.obs[celltypeOBS].unique()], index=varAdata_DS_geno.obs[celltypeOBS].unique()).T
    TotalDF = AltReadsDF + RefReadsDF

    ASEdf = pd.DataFrame()

    # Perform cluster-wise thresholding and test 
    for cl in TotalDF.columns:
        MinCountsLoci = TotalDF[TotalDF[cl] > threshold].index.tolist()
        testDF = pd.DataFrame({"AltReads":AltReadsDF.loc[MinCountsLoci,cl], 
                      "RefReads":RefReadsDF.loc[MinCountsLoci, cl]})

        # Two-sided binomial test against p=0.5 (balanced allelic expression)
        result = testDF.apply(lambda x: stats.binom_test(
            np.array([x["AltReads"],x["RefReads"]]),
            p=.5),
            axis=1)

        if len(result) > 0:
            result = result.to_frame(name="pVal")
            print("Cluster ",cl)
            print("Total tested loci :",testDF.shape[0])
            print("NonCorrected p < 0.05 :",(result["pVal"] < 0.05).sum())

            # Benjamini-Hochberg correction within the cluster
            fdrCorr = sm.stats.multipletests(result["pVal"], method='fdr_bh', alpha=fdr,  is_sorted=False)
            result["Significant"] = fdrCorr[0]
            result["FDR"] = fdrCorr[1]


            print("Corrected FDR < 0.05 :",result["Significant"].sum())
            clDF = result.merge(right=mappedVariants[["SYMBOL","indexer"]], 
                                          left_index=True,right_index=True )
            print("""
            """)
            clDF["celltype"] = cl
            clDF["AltReads"] = AltReadsDF.loc[clDF.index,cl]
            clDF["RefReads"] = RefReadsDF.loc[clDF.index,cl]
            clDF["Prop"] = clDF["AltReads"] / (clDF["AltReads"] + clDF["RefReads"])


            ASEdf = pd.concat([ASEdf, clDF], ignore_index=True).sort_values("FDR", ascending = True)
    print("Total tested loci :",len(ASEdf.indexer.unique()))
    print("Total significant scored loci :",ASEdf.Significant.sum())

    return(ASEdf)

var anndata¶

In [7]:
# Load the per-variant AnnData (cells x SNP loci, with Ref/Alt read layers)
varAdata = sc.read_h5ad(outdir+"/adatas/VarAnndata_complete.h5ad")
In [8]:
# Datasets split by design: chimeric = multiplexed lines, non_chimeric = single line
non_chimeric = varAdata.obs.loc[varAdata.obs["type"] == "non_chimeric","dataset"].unique().tolist()
print(non_chimeric)
chimeric = varAdata.obs.loc[varAdata.obs["type"] == "chimeric","dataset"].unique().tolist()
print(chimeric)
['DownD50', 'DownD100', 'DownD250']
['UpD50', 'UpD300', 'UpD100_2', 'UpD100_1']

CTL08A to match available WGS¶

In [9]:
# ASE restricted to the CTL08A genotype (the line with matched WGS), per leiden cell type
ASE = aseByObs(adataVar=varAdata,genoOBS="cellID_newName", geno="CTL08A",celltypeOBS="leidenAnnotated",threshold= 50,fdr=0.05 ,vcf=vcf,mappedVariants=MappedVars)
ASE
21543
Cluster  Neurons
Total tested loci : 469
NonCorrected p < 0.05 : 181
Corrected FDR < 0.05 : 134

            
Cluster  CajalR_like
Total tested loci : 357
NonCorrected p < 0.05 : 134
Corrected FDR < 0.05 : 106

            
Cluster  RadialGliaProgenitors
Total tested loci : 247
NonCorrected p < 0.05 : 94
Corrected FDR < 0.05 : 64

            
Cluster  GlutamatergicNeurons_early
Total tested loci : 558
NonCorrected p < 0.05 : 237
Corrected FDR < 0.05 : 185

            
Cluster  ProliferatingProgenitors
Total tested loci : 343
NonCorrected p < 0.05 : 125
Corrected FDR < 0.05 : 100

            
Cluster  MigratingNeurons
Total tested loci : 359
NonCorrected p < 0.05 : 127
Corrected FDR < 0.05 : 93

            
Cluster  GlutamatergicNeurons_late
Total tested loci : 152
NonCorrected p < 0.05 : 59
Corrected FDR < 0.05 : 42

            
Cluster  OuterRadialGliaAstrocytes
Total tested loci : 204
NonCorrected p < 0.05 : 77
Corrected FDR < 0.05 : 57

            
Cluster  Interneurons
Total tested loci : 45
NonCorrected p < 0.05 : 20
Corrected FDR < 0.05 : 19

            
Total tested loci : 712
Total significant scored loci : 800
Out[9]:
pVal Significant FDR SYMBOL indexer celltype AltReads RefReads Prop
0 0.000000e+00 True 0.000000e+00 UBC 12_124913268 Neurons 0.0 1166.0 0.000000
1 0.000000e+00 True 0.000000e+00 UBC 12_124913268 GlutamatergicNeurons_early 0.0 1297.0 0.000000
2 1.139238e-305 True 4.089864e-303 UBC 12_124913268 MigratingNeurons 0.0 1014.0 0.000000
3 2.078230e-274 True 7.419281e-272 UBC 12_124913268 CajalR_like 1.0 919.0 0.001087
4 2.491799e-206 True 8.546870e-204 UBC 12_124913268 ProliferatingProgenitors 0.0 684.0 0.000000
... ... ... ... ... ... ... ... ... ...
2594 9.110721e-01 False 1.000000e+00 PNMA1 14_73712097 RadialGliaProgenitors 39.0 41.0 0.487500
2595 9.196068e-01 False 1.000000e+00 DDX17 22_38483683 RadialGliaProgenitors 48.0 50.0 0.489796
2596 1.000000e+00 False 1.000000e+00 AURKAIP1 1_1374025 RadialGliaProgenitors 147.0 146.0 0.501706
2598 1.000000e+00 False 1.000000e+00 TOMM22 22_38684240 GlutamatergicNeurons_early 164.0 165.0 0.498480
2582 1.000000e+00 False 1.000000e+00 CDK1 10_60794716 ProliferatingProgenitors 110.0 109.0 0.502283

2734 rows × 9 columns

Here we extract loci from the KOLF-predicted ASE to check whether they are actually heterozygous in the WGS¶

In [10]:
# Significant single-cell ASE loci; pull their bulkRNA REF/ALT alleles and
# the KOLF genotype string from the joint VCF
LociTocheck = ASE.loc[ASE.Significant,"indexer"].unique().tolist()

LociTocheck = vcf.loc[LociTocheck,["REF","ALT", mappings["KOLF"]]].rename(columns={mappings["KOLF"]:"{}Genotype_bulkRNA".format(mappings["KOLF"]),
                                                                                  "REF":"bulkRNA_REF","ALT":"bulkRNA_ALT"})
LociTocheck.index.name = None
# Index is "CHROM_POS": split it back into chr / pos columns for liftover
LociTocheck["chr"] = [i.split("_")[0] for i in LociTocheck.index.tolist()]
LociTocheck["pos"] = [int(i.split("_")[1]) for i in LociTocheck.index.tolist()]
LociTocheck["indexer"] = LociTocheck.index.tolist()

# The HipSci WGS VCF is hg19, so lift our hg38 loci over before matching
LociTocheck = liftOverDF(LociTocheck, "chr","pos","indexer", "hg38","hg19")
LociTocheck

# Write the (chr, pos) list consumed by the awk matching step below
LociTocheck[["chr_hg19","pos_hg19"]].to_csv(outdir+"/LociTocheckList.tsv", sep="\t", index=None, header=None)
lociTocheckPath = outdir+"/LociTocheckList.tsv"
In [11]:
## The WGS genotype file is big so we implemented the match via awk
In [12]:
%%bash -s "$lociTocheckPath" "$kolfGTRaw"  "$kolfWGS_Matched_loci"

# Keep only WGS records whose (chrom, pos) pair appears in the loci list ($1):
# first pass (NR==FNR) loads the list into array a, second pass filters the
# decompressed WGS VCF ($2); cut keeps fields 1-7 plus the genotype (field 10),
# writing the result to $3.
awk -v FS="\t" 'NR==FNR{a[$1,$2];next} ($1,$2) in a' $1 <(zcat $2 ) | cut -f1,2,3,4,5,6,7,10 > $3
In [13]:
# Load the awk-matched WGS records; last kept column (VCF field 10) is the
# KOLF genotype string
chrom_wgsDF = pd.read_csv(kolfWGS_Matched_loci, header=None, sep="\t", names=["chr_wgs_hg19","pos_wgs_hg19","rs_wgs","ref_wgs","alt_wgs","q_wgs","filt_wgs","{}Genotype_WGS".format(mappings["KOLF"])])
chrom_wgsDF["indexer_hg19"] =  chrom_wgsDF["chr_wgs_hg19"].astype(str)+"_"+chrom_wgsDF["pos_wgs_hg19"].astype(str)
# Keep only the first ALT allele; the gVCF "<NON_REF>" placeholder becomes "."
chrom_wgsDF["alt_wgs"] = chrom_wgsDF["alt_wgs"].str.split(",", expand = True)[0].replace({"<NON_REF>":"."})
chrom_wgsDF
Out[13]:
chr_wgs_hg19 pos_wgs_hg19 rs_wgs ref_wgs alt_wgs q_wgs filt_wgs CTL08AGenotype_WGS indexer_hg19
0 1 1309405 rs150194697 T C 474.77 . 0/1:11,14,0:25:99:503,0,359,536,401,936:6,5,4,10 1_1309405
1 1 3697035 rs17411356 A G 692.77 . 0/1:13,22,0:35:99:721,0,425,760,491,1251:6,7,14,8 1_3697035
2 1 10521200 . T . . . 0/0:31:76:31:0,76,1027 1_10521200
3 1 10521238 . T . . . 0/0:34:30:34:0,30,1164 1_10521238
4 1 12628188 rs4580 T C 490.77 . 0/1:13,14,0:27:99:519,0,433,558,475,1033:6,7,6,8 1_12628188
... ... ... ... ... ... ... ... ... ...
357 22 42908913 . A . . . 0/0:21:18:21:0,18,664 22_42908913
358 22 44220673 rs138056 C A 274.77 . 0/1:15,10,0:25:99:303,0,570,348,599,947:7,8,4,6 22_44220673
359 22 49146299 rs695689 G A 154.77 . 0/1:11,6,0:17:99:183,0,413,217,431,648:4,7,3,3 22_49146299
360 22 50962208 rs12148 T G 693.77 . 0/1:13,19,0:32:99:722,0,419,761,476,1237:4,9,5,14 22_50962208
361 X 24095220 . A . . . 0/0:20:57:20:0,57,855 X_24095220

362 rows × 9 columns

In [14]:
# Attach the hg19 WGS matches to the lifted bulkRNA loci (left join keeps
# loci with no WGS record, as NaN)
LociTocheck = LociTocheck.merge(chrom_wgsDF, on="indexer_hg19", how = "left")

# Join back onto the significant single-cell ASE table via the hg38 indexer
mergedDF = ASE[ASE.Significant].rename(columns={"indexer":"old_indexer_hg38"}).merge(LociTocheck, on="old_indexer_hg38")
mergedDF

# NOTE(review): `mergeDF` (below) and `mergedDF` (above) are distinct
# variables despite the near-identical names.
mergeDF = mergedDF[["indexer_hg19","old_indexer_hg38","FDR","old_chr_hg38", 
                    "old_pos_hg38","pos_hg19","chr_hg19", 'SYMBOL',
                    'celltype', 'AltReads', 'RefReads', 'Prop', 'bulkRNA_REF','bulkRNA_ALT', "{}Genotype_bulkRNA".format(mappings["KOLF"]),
                    'chr_wgs_hg19', 'pos_wgs_hg19', 'rs_wgs', 'ref_wgs', 'alt_wgs', "{}Genotype_WGS".format(mappings["KOLF"])]]


# Keep one row per hg19 locus, preferring rows with a WGS genotype call
# (NaNs are sorted last before drop_duplicates keeps the first)
mergeDF = mergeDF.sort_values(["{}Genotype_WGS".format(mappings["KOLF"])],ascending=True, na_position='last').drop_duplicates('indexer_hg19', keep='first')
In [15]:
# Report how many significant ASE loci have no matching WGS record
print("{} out of {} loci were not found in WGS".format(mergeDF.CTL08AGenotype_WGS.isna().sum(), mergeDF.shape[0]))
print("For rate of {} percent non mapping loci".format(round(mergeDF.CTL08AGenotype_WGS.isna().sum() / mergeDF.shape[0]*100,1)))
38 out of 400 loci were not found in WGS
For rate of 9.5 percent non mapping loci

Many ASE genes were housekeeping genes, so we check WGS – bulk RNA coherence¶

Check for housekeeping¶

In [16]:
# Spot-check classic housekeeping genes: genotype and alleles should agree
# between bulk RNA and WGS if the calls are trustworthy
mergeDF[mergeDF.SYMBOL.isin(["UBC","GAPDH","ACTB"])][["SYMBOL","chr_hg19","chr_wgs_hg19","pos_hg19","pos_wgs_hg19","rs_wgs","CTL08AGenotype_bulkRNA","CTL08AGenotype_WGS","bulkRNA_REF","ref_wgs","bulkRNA_ALT","alt_wgs"]].dropna()
Out[16]:
SYMBOL chr_hg19 chr_wgs_hg19 pos_hg19 pos_wgs_hg19 rs_wgs CTL08AGenotype_bulkRNA CTL08AGenotype_WGS bulkRNA_REF ref_wgs bulkRNA_ALT alt_wgs
537 ACTB 7 7 5567112 5567112.0 rs7612 0/1:650,524:1174:99:16596,0,17870 0/1:23,15,0:38:99:496,0,778,565,823,1387:11,12... C C T T
164 ACTB 7 7 5567020 5567020.0 rs138738998 0/1:494,457:951:99:15017,0,14961 0/1:23,17,0:40:99:534,0,831,602,882,1484:13,10... C C T T
189 UBC 12 12 125397364 125397364.0 rs8963 0/1:94,352:446:99:10810,0,2230 1/1:2,20,0:22:16:0|1:125397358_T_C:606,16,0,61... A A G G

Sankey category¶

In [17]:
# One row per hg19 locus for the genotype-agreement Sankey
mergeDFUnique = mergeDF.drop_duplicates('indexer_hg19', keep='first')
# Keep only the GT field (text before the first ":") of each genotype string
mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] = mergeDFUnique["CTL08AGenotype_bulkRNA"].str.split(":", expand = True)[0]
mergeDFUnique["CTL08AGenotype_WGS_sankey"] = mergeDFUnique["CTL08AGenotype_WGS"].str.split(":", expand = True)[0]
mergeDFUnique = mergeDFUnique.fillna("NotCovered")
# Prefix so RNA and WGS nodes are distinguishable in the diagram
mergeDFUnique["CTL08AGenotype_WGS_sankey"] = "WGS_"+mergeDFUnique["CTL08AGenotype_WGS_sankey"]
mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] = "RNA_"+mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"]


# NOTE(review): mid-notebook import; conventionally belongs in the import cell
import holoviews as hv
obsTupleToMap = ("CTL08AGenotype_bulkRNA_sankey","CTL08AGenotype_WGS_sankey")
SankeyDF=mergeDFUnique[list(obsTupleToMap)]
SankeyDF.columns = [obsTupleToMap[0],obsTupleToMap[1]]
# Count loci per (RNA genotype, WGS genotype) pair; drop empty flows
SankeyDF = SankeyDF.groupby([obsTupleToMap[0],obsTupleToMap[1]]).size().reset_index().rename(columns={0:'count'})
SankeyDF=SankeyDF[SankeyDF["count"] != 0]
hv.extension('bokeh')

sankey1 = hv.Sankey(SankeyDF, kdims=[obsTupleToMap[0],obsTupleToMap[1]], vdims="count")


sankey1.opts(label_position='outer',
edge_color=obsTupleToMap[0], edge_line_width=0,
node_alpha=1.0, node_width=40, node_sort=False,
width=1100, height=700, bgcolor="white")
Out[17]:

Mapping of base edits¶

In [18]:
# Classify each locus by bulkRNA/WGS genotype agreement, then count REF>ALT
# base-edit classes per category.
# BUGFIX: .copy() after each .loc selection prevents the SettingWithCopyWarning
# the original emitted when adding the BaseEdit column to a slice.
Concordant = mergeDF.loc[(mergeDF["CTL08AGenotype_bulkRNA"].str.split(":", expand = True)[0] == mergeDF["CTL08AGenotype_WGS"].str.split(":", expand = True)[0])].copy()
Concordant["BaseEdit"] = Concordant["bulkRNA_REF"]+">"+Concordant["bulkRNA_ALT"]
# groupby(..., as_index=False).size() already names its count column "size"
# (which is what vdims="size" below consumes); the original
# rename(columns={0:"counts"}) was a no-op and has been removed.
Concordant = pd.DataFrame(Concordant.groupby(["BaseEdit"], as_index=False).size())
Concordant["Agreement"] = "concordant"

Discordant = mergeDF.loc[~(mergeDF["CTL08AGenotype_bulkRNA"].str.split(":", expand = True)[0] == mergeDF["CTL08AGenotype_WGS"].str.split(":", expand = True)[0])].dropna().copy()
Discordant["BaseEdit"] = Discordant["bulkRNA_REF"]+">"+Discordant["bulkRNA_ALT"]
Discordant = pd.DataFrame(Discordant.groupby(["BaseEdit"], as_index=False).size())
Discordant["Agreement"] = "Discordant"


# Loci with no matching WGS record at all
Unmapped = mergeDF.loc[mergeDF["CTL08AGenotype_WGS"].isna()].copy()
Unmapped["BaseEdit"] = Unmapped["bulkRNA_REF"]+">"+Unmapped["bulkRNA_ALT"]
Unmapped = pd.DataFrame(Unmapped.groupby(["BaseEdit"], as_index=False).size())
Unmapped["Agreement"] = "Unmapped"

PieDF = pd.concat([Concordant, Discordant, Unmapped], axis = 0, ignore_index=True)


# Sankey: agreement category -> base-edit class, weighted by locus counts
sankey1 = hv.Sankey(PieDF, kdims=["Agreement","BaseEdit"], vdims="size")


sankey1.opts(label_position='outer',
edge_color="BaseEdit", edge_line_width=1,
node_alpha=1.0, node_width=40, node_sort=False,
width=1100, height=700, bgcolor="white")
<ipython-input-1-c45f1868a562>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Concordant["BaseEdit"] = Concordant["bulkRNA_REF"]+">"+Concordant["bulkRNA_ALT"]
<ipython-input-1-c45f1868a562>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Unmapped["BaseEdit"] = Unmapped["bulkRNA_REF"]+">"+Unmapped["bulkRNA_ALT"]
Out[18]:

Bulk RNA MAF Plot¶

In [19]:
# Bulk RNA MAF at RNA-het loci, split by WGS agreement.
# Field 1 of the genotype string is the allelic-depth pair "refReads,altReads".
AllelerateDisCordant = mergeDFUnique.loc[(mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1") & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/0" ),"CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[1]
AllelerateDisCordant = AllelerateDisCordant.to_frame("MAF")
AllelerateDisCordant["Concordant"] = "Discordant"
AllelerateDisCordant

AllelerateConcordant = mergeDFUnique.loc[(mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1") & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/1" ),"CTL08AGenotype_bulkRNA"].str.split(":", expand=True)[1]
AllelerateConcordant = AllelerateConcordant.to_frame("MAF")
AllelerateConcordant["Concordant"] = "Concordant"
AllelerateConcordant

DistributionAgreement = pd.concat([AllelerateConcordant,AllelerateDisCordant])
#DistributionAgreement.str.split(",", expand = True)[1].astype(int) / (Allelerate.str.split(",", expand = True)[0].astype(int)+Allelerate.str.split(",", expand = True)[1].astype(int))

# MAF = altReads / (refReads + altReads)
DistributionAgreement["MAF"] =DistributionAgreement.MAF.str.split(",", expand = True)[1].astype(int) / (DistributionAgreement.MAF.str.split(",", expand = True)[0].astype(int)+DistributionAgreement.MAF.str.split(",", expand = True)[1].astype(int))

sns.histplot(DistributionAgreement, x= "MAF", hue="Concordant", bins=50)
Out[19]:
<AxesSubplot:xlabel='MAF', ylabel='Count'>

SC RNA MAF Plot¶

In [20]:
# Same comparison using the single-cell alt-allele proportion ("Prop")
# instead of the bulk RNA allelic depths
AllelerateDisCordant = mergeDFUnique.loc[(mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1") & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/0" ),"Prop"]
AllelerateDisCordant = AllelerateDisCordant.to_frame("Prop")
AllelerateDisCordant["Concordant"] = "Discordant"


AllelerateConcordant = mergeDFUnique.loc[(mergeDFUnique["CTL08AGenotype_bulkRNA_sankey"] == "RNA_0/1") & (mergeDFUnique["CTL08AGenotype_WGS_sankey"] == "WGS_0/1" ),"Prop"]
AllelerateConcordant = AllelerateConcordant.to_frame("Prop")
AllelerateConcordant["Concordant"] = "Concordant"
AllelerateConcordant

DistributionAgreement = pd.concat([AllelerateConcordant,AllelerateDisCordant])
#DistributionAgreement.str.split(",", expand = True)[1].astype(int) / (Allelerate.str.split(",", expand = True)[0].astype(int)+Allelerate.str.split(",", expand = True)[1].astype(int))


sns.histplot(DistributionAgreement, x= "Prop", hue="Concordant", bins=50)
Out[20]:
<AxesSubplot:xlabel='Prop', ylabel='Count'>

In the next section we assess the coherence of the expressed allele across genotypes, since the relative alleles' inheritance is unknown¶

In [21]:
# ASE aggregating all genotypes (geno=None): each line contributes only its
# own het loci, then results are concatenated across lines
ASE2 = aseByObs(varAdata, genoOBS="cellID_newName" ,celltypeOBS="leidenAnnotated" , geno=None,threshold= 50,fdr=0.05 ,vcf=vcf,mappedVariants=MappedVars)
ASE2 = ASE2[ASE2.Significant]
Cluster  CajalR_like
Total tested loci : 1189
NonCorrected p < 0.05 : 486
Corrected FDR < 0.05 : 381

            
Cluster  Neurons
Total tested loci : 2472
NonCorrected p < 0.05 : 932
Corrected FDR < 0.05 : 724

            
Cluster  Interneurons
Total tested loci : 988
NonCorrected p < 0.05 : 415
Corrected FDR < 0.05 : 340

            
Cluster  GlutamatergicNeurons_early
Total tested loci : 1392
NonCorrected p < 0.05 : 550
Corrected FDR < 0.05 : 441

            
Cluster  ProliferatingProgenitors
Total tested loci : 2211
NonCorrected p < 0.05 : 854
Corrected FDR < 0.05 : 655

            
Cluster  MigratingNeurons
Total tested loci : 1298
NonCorrected p < 0.05 : 466
Corrected FDR < 0.05 : 337

            
Cluster  OuterRadialGliaAstrocytes
Total tested loci : 1493
NonCorrected p < 0.05 : 543
Corrected FDR < 0.05 : 415

            
Cluster  RadialGliaProgenitors
Total tested loci : 1611
NonCorrected p < 0.05 : 598
Corrected FDR < 0.05 : 467

            
Cluster  GlutamatergicNeurons_late
Total tested loci : 2331
NonCorrected p < 0.05 : 869
Corrected FDR < 0.05 : 684

            
Total tested loci : 3718
Total significant scored loci : 4444
In [22]:
def ASEFlipTest(varAdata, ASEdf, minDev, minCounts ):
    """Check cross-genotype consistency of the expressed allele.

    For every cell type in `ASEdf`, and for every genotype heterozygous
    (0/1 in the module-level `vcf`) at the ASE loci, compute the alt-allele
    fraction; a locus is flagged inconsistent when at least one genotype
    sits above 0.5+minDev AND at least one below 0.5-minDev.

    Parameters
    ----------
    varAdata : AnnData with "RefReads"/"AltReads" layers (cells x loci).
    ASEdf : significant ASE calls with "celltype" and "indexer" columns.
    minDev : minimum deviation from 0.5 to call an expression direction.
    minCounts : minimum total reads for a genotype/locus to be considered.

    Returns
    -------
    (CountsDF, FinalDF): per-locus "altFraction|totalReads" strings per
    genotype, and per-celltype summary counts.

    NOTE(review): reads the module-level `vcf` and `MappedVars` rather than
    taking them as parameters -- depends on notebook execution order.
    """

    FinalDF = pd.DataFrame(index = ASEdf["celltype"].unique(), columns=["Loci_Het_in_Atl2Genotypes","Loci_Het_and_inconsistent_in_Atl2Genotypes"])
    inconsistentLoci = {}
    CountsDF = pd.DataFrame()


    for cellType in ASEdf["celltype"].unique():
    #for cellType in ["Neurons"]:
        #print("{} not consistent loci".format(cellType))


        ASEdfcl = ASEdf[ASEdf["celltype"] == cellType]

        # Rows: ASE loci of this cell type; columns (added below): genotypes
        testDF = pd.DataFrame(index=ASEdfcl.indexer.unique().tolist())

        for g in varAdata.obs.cellID_newName.unique():
            # Loci at which genotype g is called heterozygous in the VCF
            p1 = vcf.loc[(ASEdfcl.indexer.unique().tolist())]
            p1 = p1[p1[g].str.contains("0/1")].index.tolist()

            t = varAdata[varAdata.obs.cellID_newName == g,p1]
            total = (t.layers["RefReads"].sum(axis = 0).flatten().A1 + t.layers["AltReads"].sum(axis = 0).flatten().A1)


            # NOTE(review): emits RuntimeWarning (NaN) when total==0; those
            # entries are removed by the total >= minCounts filter below
            t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total)
            t = pd.Series(t, index=p1)

            t = t[total >= minCounts]


            testDF[g] = t

        #Print counts of incoherent loci per cell type
        inconsistentLoci = testDF[(((testDF >= .5+minDev).sum(axis = 1) > 0) & ((testDF <= .5-minDev).sum(axis = 1)))].index.tolist()  
        # locus -> genotypes that have data (non-NaN) at that locus
        LociToIDMap = testDF.loc[inconsistentLoci].apply(lambda row: row[row.notna()].index.tolist(), axis = 1).to_dict()

        CountsDFlocal = pd.DataFrame(index = list(LociToIDMap.keys()))
        CountsDFlocal["locus"] = LociToIDMap.keys()
        CountsDFlocal["celltype"] = cellType

        for i in LociToIDMap.items():

            # i[0] = locus id, i[1] = genotypes with data at this locus
            varAdataSS = varAdata[:,i[0]]

            for g in i[1]:

                #CountsDFlocal.loc[i[0],"{}_RefReads".format(g)] = varAdataSS[varAdataSS.obs.cellID_newName == g].layers["RefReads"].sum()
                #CountsDFlocal.loc[i[0],"{}_AltReads".format(g)] = varAdataSS[varAdataSS.obs.cellID_newName == g].layers["AltReads"].sum()

                Refsum = varAdataSS[varAdataSS.obs.cellID_newName == g].layers["RefReads"].sum()
                AltSum = varAdataSS[varAdataSS.obs.cellID_newName == g].layers["AltReads"].sum()

                totalSum = Refsum + AltSum

                bVal = AltSum / totalSum

                # Compact "altFraction|totalReads" encoding for display
                CountsDFlocal.loc[i[0],"{}_bval|counts".format(g)] = str(round(bVal,2))+"|"+str(round(totalSum))


        CountsDF = pd.concat([CountsDF, CountsDFlocal], ignore_index=True)

        #Store results in summary table
        FinalDF.loc[cellType,"Loci_Het_in_Atl2Genotypes"] =  (testDF.notna().sum(axis = 1) > 1).sum()
        FinalDF.loc[cellType,"Loci_Het_and_inconsistent_in_Atl2Genotypes"] =  (((testDF >= .5+minDev).sum(axis = 1) > 0) & ((testDF <= .5-minDev).sum(axis = 1))).sum()
        
        # NOTE(review): the merge below is computed but its result is never
        # assigned or returned -- looks like leftover debugging code
        CountsDF.set_index(["locus"]).merge(MappedVars,left_index=True,right_index=True ).iloc[:,[0,1,2,3,4,7]]
        
    return CountsDF, FinalDF
In [23]:
# Flag loci whose alt-allele fraction flips direction (above vs below 0.5) across genotypes
AllelicCounts, SummaryDF = ASEFlipTest(varAdata, ASE2, minDev=.05, minCounts=20 )
<ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide
  t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total)
<ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide
  t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total)
<ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide
  t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total)
<ipython-input-1-8fd354d71ee2>:26: RuntimeWarning: invalid value encountered in true_divide
  t = (t.layers["AltReads"].sum(axis = 0).flatten().A1 / total)
In [24]:
# Per-locus "altFraction|totalReads" per genotype at the inconsistent loci
AllelicCounts
Out[24]:
locus celltype CTL01_bval|counts CTL08A_bval|counts CTL02A_bval|counts CTL04E_bval|counts
0 22_48750487 CajalR_like 0.44|61 0.56|118 NaN NaN
1 17_16342702 Neurons 0.55|2482 0.54|2310 0.43|9785 NaN
2 12_76027474 Neurons 0.04|93 NaN 0.74|810 NaN
3 1_161214307 Neurons NaN 0.7|190 0.31|905 NaN
4 6_87529548 Neurons NaN NaN 0.44|315 0.56|27
5 2_97549448 Neurons 0.3|30 NaN 0.68|537 NaN
6 20_25452900 Neurons 0.42|137 0.5|207 0.57|440 NaN
7 16_70162281 Neurons 0.52|46 0.55|47 0.27|228 NaN
8 14_64052737 Neurons 0.45|47 0.6|40 0.45|883 NaN
9 2_28802613 Neurons 0.57|194 0.54|270 0.58|699 0.33|73
10 4_87529065 OuterRadialGliaAstrocytes 0.7|23 NaN 0.32|1171 NaN
11 16_70162281 OuterRadialGliaAstrocytes 0.52|46 0.55|47 0.27|228 NaN
12 14_69354229 OuterRadialGliaAstrocytes NaN 0.61|281 0.45|350 NaN
13 1_161214307 OuterRadialGliaAstrocytes NaN 0.7|190 0.31|905 NaN
14 17_16342702 OuterRadialGliaAstrocytes 0.55|2482 0.54|2310 0.43|9785 NaN
15 14_64052737 OuterRadialGliaAstrocytes 0.45|47 0.6|40 0.45|883 NaN
16 6_149632845 OuterRadialGliaAstrocytes 0.57|110 NaN 0.46|584 0.44|81
17 17_16342702 ProliferatingProgenitors 0.55|2482 0.54|2310 0.43|9785 NaN
18 1_161214307 ProliferatingProgenitors NaN 0.7|190 0.31|905 NaN
19 11_65717866 ProliferatingProgenitors 0.6|62 NaN 0.51|138 0.44|43
20 11_116748357 ProliferatingProgenitors NaN 0.58|159 0.44|474 0.59|82
21 12_120579368 ProliferatingProgenitors NaN 0.52|306 0.57|670 0.44|158
22 19_17582871 ProliferatingProgenitors 0.4|94 0.58|113 0.53|225 NaN
23 11_62662863 ProliferatingProgenitors NaN NaN 0.44|373 0.67|36
24 22_21701105 ProliferatingProgenitors 0.56|110 0.61|77 0.43|366 NaN
25 17_16342702 MigratingNeurons 0.55|2482 0.54|2310 0.43|9785 NaN
26 2_97549448 MigratingNeurons 0.3|30 NaN 0.68|537 NaN
27 16_88715206 MigratingNeurons 0.43|168 0.65|167 NaN NaN
28 2_37204503 MigratingNeurons 0.6|166 0.45|155 NaN NaN
29 2_28802613 RadialGliaProgenitors 0.57|194 0.54|270 0.58|699 0.33|73
30 12_120579368 RadialGliaProgenitors NaN 0.52|306 0.57|670 0.44|158
31 20_63247598 RadialGliaProgenitors 0.5|268 0.38|211 0.57|461 NaN
32 17_16342702 GlutamatergicNeurons_late 0.55|2482 0.54|2310 0.43|9785 NaN
33 12_76027474 GlutamatergicNeurons_late 0.04|93 NaN 0.74|810 NaN
34 2_97549448 GlutamatergicNeurons_late 0.3|30 NaN 0.68|537 NaN
35 1_161214307 GlutamatergicNeurons_late NaN 0.7|190 0.31|905 NaN
36 11_45240575 GlutamatergicNeurons_late 0.57|82 NaN 0.36|282 0.61|77
37 2_197496910 GlutamatergicNeurons_late 0.32|59 NaN 0.82|158 NaN
38 9_83978399 GlutamatergicNeurons_late 0.37|49 NaN 0.54|778 0.61|56
39 16_15084681 GlutamatergicNeurons_late 0.75|20 NaN 0.45|253 0.59|34
40 18_9256260 GlutamatergicNeurons_late 0.57|30 NaN 0.41|329 NaN
41 13_110641500 GlutamatergicNeurons_late NaN 0.59|41 0.39|324 0.58|38
42 2_97547721 GlutamatergicNeurons_late NaN 0.38|77 0.59|138 NaN
43 3_25638247 GlutamatergicNeurons_late 0.53|60 0.79|53 0.43|998 NaN
44 11_117285836 GlutamatergicNeurons_late 0.34|29 0.74|23 0.46|318 NaN
45 13_52689848 GlutamatergicNeurons_late 0.64|22 NaN 0.34|139 NaN
46 11_116748357 GlutamatergicNeurons_late NaN 0.58|159 0.44|474 0.59|82
47 17_16342702 Interneurons 0.55|2482 0.54|2310 0.43|9785 NaN
48 12_122865657 Interneurons 0.58|279 NaN 0.54|566 0.37|87
49 11_116748357 Interneurons NaN 0.58|159 0.44|474 0.59|82
50 1_161214307 Interneurons NaN 0.7|190 0.31|905 NaN
In [25]:
# Move the index into a column so the frame can be melted for plotting
SummaryDF["cluster"] = SummaryDF.index
SummaryDF
Out[25]:
Loci_Het_in_Atl2Genotypes Loci_Het_and_inconsistent_in_Atl2Genotypes cluster
CajalR_like 165 1 CajalR_like
Neurons 291 9 Neurons
OuterRadialGliaAstrocytes 191 7 OuterRadialGliaAstrocytes
GlutamatergicNeurons_early 209 0 GlutamatergicNeurons_early
ProliferatingProgenitors 274 8 ProliferatingProgenitors
MigratingNeurons 164 4 MigratingNeurons
RadialGliaProgenitors 213 3 RadialGliaProgenitors
GlutamatergicNeurons_late 246 15 GlutamatergicNeurons_late
Interneurons 140 4 Interneurons
In [26]:
# Bar plot per cluster: loci het in >=2 genotypes vs the inconsistent subset
sns.barplot(data=SummaryDF.melt("cluster"), x="cluster", y="value", hue="variable")
plt.xticks(rotation=45)
Out[26]:
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 [Text(0, 0, 'CajalR_like'),
  Text(1, 0, 'Neurons'),
  Text(2, 0, 'OuterRadialGliaAstrocytes'),
  Text(3, 0, 'GlutamatergicNeurons_early'),
  Text(4, 0, 'ProliferatingProgenitors'),
  Text(5, 0, 'MigratingNeurons'),
  Text(6, 0, 'RadialGliaProgenitors'),
  Text(7, 0, 'GlutamatergicNeurons_late'),
  Text(8, 0, 'Interneurons')])
In [ ]: